The United Nations, Department of Economic and Social Affairs published dataset presents estimates of international migrant by age, sex and origin. Estimates are presented for 1990, 1995, 2000, 2005, 2010, 2015 and 2019 and are available for all countries and areas of the world. The estimates are based on official statistics on the foreign-born or the foreign population.
(Read the data into R)
library(readr)
UN_MigrantStock <- read_csv("UN_MigrantStockByOriginAndDestination_2019/Table 1-Table 1.csv",
skip = 16)
head(UN_MigrantStock)(Remove Regions and keep only Countries)
First thing you would notice is that the first 8 rows are not countries, they are regions. This time we want to see how people are migrating from countries to countries so we can remove these rows for the regions. When you look at ‘X6’ column, it looks that those ‘region’ rows don’t have any value there.
By running the command like below to keep only the rows whose X6 column have NA
These are all regions, not countries, which means that we can safely remove these rows by adding an exclamation mark ‘!’ right before ‘is.na()’ function like below.
Now we got a dataframe of 1,624 rows with 246 columns as return with all countries only data for each year.
When you look at the columns we would notice that there are unnecessary columns like ‘Total’, ‘Other South’, etc, because we are interested in estimates of the migrants only for countries to countries. We can remove those unnecessary columns with ‘select’ command along with other unnecessary columns like below.
UN_Countries_df <- UN_Countries %>%
select(-X2, -X4, -X6, -Total, -starts_with("Other"))
head(UN_Countries_df)I’m using minus ‘-’ to delete columns and using ‘start_with’ function inside ‘select’ command to delete multiple columns whose names matche the text pattern of “Other”.
UN_Countries_rename <- UN_Countries_df %>%
rename(
year = X1,
destination_country = X3,
country_code = X5
)
head(UN_Countries_rename)Now, it’s ready to tidy this ‘matrix’-ish data form by using ‘gather’ command from tidyr package.
library(tidyr)
Countries_by_no <- gather(UN_Countries_rename, "origin_country", "migrants", 4:235, na.rm = TRUE)
head(Countries_by_no)Here we want to know which countries has the highest number of migrant by year . To do so, I would use
# group by year
# change the migrants variable into num type
num_country_df <- clean_country_df
num_country_df$migrants <- parse_number(num_country_df$migrants)
str(num_country_df)## Classes 'tbl_df', 'tbl' and 'data.frame': 78754 obs. of 4 variables:
## $ year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ destination_country: chr "Namibia" "South Africa" "Egypt" "Libya" ...
## $ origin_country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ migrants : num 64 59 237 677 254 ...
## ..- attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 345 obs. of 4 variables:
## .. ..$ row : int 496 546 597 648 1397 1419 1589 1644 1938 1956 ...
## .. ..$ col : int NA NA NA NA NA NA NA NA NA NA ...
## .. ..$ expected: chr "a number" "a number" "a number" "a number" ...
## .. ..$ actual : chr "-" "-" "-" "-" ...
by_dest_Country_df <- num_country_df %>%
group_by(year, destination_country, origin_country) %>%
summarise(total_migrants = sum(migrants)) %>%
arrange(desc(total_migrants))
by_dest_Country_dfAs we can see here, we got more than 78,700 colums. I would prefer to take a subset of the data based on some statistical analysis. To do so, we can get mean, median to set a filtering cretria. Additionally, I will add a ranking column to the dataframe to rank the countries
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 39 280 17319 2342 12168662 345
# take a subset by filtering the results to only pick total migrants > mean
sub_set_migrant <- select(filter(by_dest_Country_df, total_migrants > 20000), c(1:4))
# add ranking system desc. ordered
sub_set_migrant$Rank <- order(sub_set_migrant$total_migrants, decreasing = TRUE)
sub_set_migrantNow we get a much tider data about 7,000 columns, only 4 columns. However, we can use spread() function to group by year. to see
gather_by_year <- sub_set_migrant %>%
spread(key = "year", value = "total_migrants")%>%
arrange(Rank)
# replace NA with 0 value
gather_by_year[, 4:10][is.na(gather_by_year[, 4:10])] <- 0
gather_by_yearPlease be sure that you set working directory in Rstudio to the current working directory.
## [1] TRUE
file_to_open <- read.csv(file = "migrantByOriginDestination.csv", header = TRUE, sep = ",")
file_to_openI would prefer to explore the data quickly using the Heatmap chart. This can be done by assigning ‘destination_country’ to X-axis, ‘origin_country’ to Y-axis, and ‘migrants’ to Color.
Note This group is for countries that have total number of migrants above average ~ 13000 migrant per year. First step, we can plot a boxplot to explore the data
library(ggmap)
map_world <- map_data("world")
#map_world$region
subset_migrant <- sub_set_migrant
#recode the country name to be able to left join the two data frames
head(as.factor(subset_migrant$destination_country) %>%
levels())## [1] "Afghanistan" "Albania" "Algeria" "Andorra" "Angola"
## [6] "Argentina"
# RECODE NAMES
subset_migrant$destination_country <- recode(subset_migrant$destination_country
,'United States of America' = 'USA'
,'United Kingdom' = 'UK'
)
#rename the column to be the same as world_map
colnames(subset_migrant)[2] <- "country"
#View(new_dat)
# LEFT JOIN
map_world_joined <- left_join(map_world, subset_migrant, by = c('region' = 'country'))
ggplot() +
geom_polygon(data = map_world_joined, aes(x = long, y = lat, group = group, fill = total_migrants, cstat = "identity")) +
labs(title = 'Countries with highest "Migrant Population"'
,subtitle = "source: INSEAD, https://www.un.org/en/development/desa/population/migration/data/estimates2/estimates19.asp") +
geom_point() +
theme(text = element_text(family = "Gill Sans")
,plot.title = element_text(size = 15)
,plot.subtitle = element_text(size = 10)
,legend.text = element_text(size = 10)
)As we can see from the plot, for the top 100 destination countries that have most of the migrants populations over seven period of time. We will get more insights on 2015 and 2019 and inspect what is the increasing rate for the top 10 migration countries.
migration_15_19 = sub_set_migrant %>%
filter(total_migrants >= 3000000 & year %in% c(2010 ,2015, 2019)) %>%
select(destination_country, origin_country, year, total_migrants)%>%
arrange(desc(total_migrants))
migration_15_19ggplot(migration_15_19, aes(x=destination_country, y=total_migrants, fill = destination_country)) +
geom_bar(stat="identity", width=1, position = position_dodge()) +
labs(fill="destination country") +
labs(title="Ordered Bar Chart",
subtitle="Top destinations for migrats population - more than 3M migrants per year") +
theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
facet_grid( ~ year) +
theme_light()We got some interesting results here, for instance, United states of America has the highest migrants population over the three time intervals. However, the total migrants that USA accepts decreased by 1 million migrants in 2019. Turkey started accepting migrants in 2019 time interval with a significant number more than 3.5M migrants per interval. As an overall observation, more countries accepting migrants throught time where 2019 interval has the the top 6 countries accepting population.
For the next steps, I am interested in investigating what is the migrants’ origin countries that migrate to the United States and Turkey.
by_origin_country <- migration_15_19 %>%
filter(destination_country %in% c("United States of America", "United Arab Emirates", "Ukraine", "Turkey", "Russian Federation", "India") & total_migrants >= 3000000) %>%
select(destination_country, origin_country, total_migrants, year)
by_origin_country## Named list()
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
ggplot(by_origin_country, aes(x = destination_country , y = total_migrants, colour = origin_country)) +
geom_point(size = 3) +
facet_grid(~ by_origin_country$year) +
labs(y = "total_migrants") +
scale_y_continuous() +
theme(axis.text.x = element_text(angle=65, vjust=0.65)) +
theme_linedraw()The majority of migrant population to the United States of America is from Mexico. Also, it seams that the Turkey accepted more than 3.7M migrant from Syrian Arab Republic in 2019 interval. The other main origin countries that have most of the population are India, Ukrain, and Russian Federation respectively with no major difference in the total population.
only_3 <- migration_15_19%>%
filter(destination_country %in% c("United States of America", "United Arab Emirates", "Turkey") & year == "2019") %>%
select(destination_country, origin_country, total_migrants)
only_3As illustrated, we can concluded that United States is the top destination for Mexicans. However, Indians like to live in the United Arab Emirates at the same time Syrians would rather to flee to Turkey
library(readr)
UN_Migrant_age <- read_csv("UN_MigrantStockByAgeAndSex_2019/Table 1-Table 1.csv", skip = 15)
UN_Migrant_ageAs the previous dataset, we need to extract only the countries not regions. To do so, we can
We now have 1,624 row with 62 columns. Now it is time to do some cleaning up.
When you look at the columns we would notice that there are unnecessary columns like ‘Total’, etc, because we are interested in estimates of the migrants age category for males and female only for countries. We can remove those unnecessary columns with ‘select’ command along with other unnecessary columns like below.
migrant_age_df <- UN_Countries_age %>%
select(-X2, -X4, -X5, -X6, -starts_with("Total"), -c(58:62))
migrant_age_dfNow we have 1624 rows with a 34 columns ### Subset the data - Age categories for both sexes
Since we are focusing here on what is the most age categories that had migrated from their origin countries to another countries. So I decided to subset the data into three subsets, both categories, males only, and females only. We are interested here in both categories.
#Use select function to fetch only the data that we interested by picking the colnames
both_sex_df <- migrant_age_df %>%
select(c("X1":"75+"))
both_sex_dfNow we have 198 columns, we can make it more tider by devide the age categories into 4 familiar categories.
| child | 0 - 19 |
|---|---|
| youth | 20 - 39 |
| ———- | ——— |
| adult | 40 - 59 |
| ———- | ——— |
| senior | 60 - 75+ |
| ———- | ——— |
we will iterate over the subset and sum each 4 column into one, but first, we need to convert the column type from char to num to be able to complete the summation.
num_age_df <- both_sex_rename
num_age_df$`0-4` <- parse_number(num_age_df$`0-4`)
num_age_df$`5-9` <- parse_number(num_age_df$`5-9`)
num_age_df$`10-14` <- parse_number(num_age_df$`10-14`)
num_age_df$`15-19` <- parse_number(num_age_df$`15-19`)
num_age_df$`20-24` <- parse_number(num_age_df$`20-24`)
num_age_df$`25-29` <- parse_number(num_age_df$`25-29`)
num_age_df$`30-34` <- parse_number(num_age_df$`30-34`)
num_age_df$`35-39` <- parse_number(num_age_df$`35-39`)
num_age_df$`40-44` <- parse_number(num_age_df$`40-44`)
num_age_df$`45-49` <- parse_number(num_age_df$`45-49`)
num_age_df$`50-54` <- parse_number(num_age_df$`50-54`)
num_age_df$`55-59` <- parse_number(num_age_df$`55-59`)
num_age_df$`60-64` <- parse_number(num_age_df$`60-64`)
num_age_df$`65-69` <- parse_number(num_age_df$`65-69`)
num_age_df$`70-74` <- parse_number(num_age_df$`70-74`)
num_age_df$`75+` <- parse_number(num_age_df$`75+`)
num_age_dfPlease be sure that you set working directory in Rstudio to the current working directory.
## [1] TRUE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 1227 6068 55709 30883 5485048 480
by_youth <- by_age%>%
filter(age %in% c("20-24", "25-29", "30-34", "35-39") & total_population >= 55709)%>%
select(year, destination_country, total_population)%>%
arrange(desc(total_population))
by_youththeme_set(theme_classic())
ggplot(by_youth, aes(x=year, y=total_population)) +
geom_point(col="tomato2", size=3) + # Draw points
geom_segment(aes(x=year,
xend=year,
y=min(total_population),
yend=max(total_population)),
linetype="dashed",
size=0.1) + # Draw dashed lines
labs(title="Dot Plot",
subtitle="Youth population over the years",
caption="source: by_youth") +
coord_flip()#check how many children
by_child <- by_age%>%
filter(age %in% c("0-4", "5-9", "10-14", "15-19") & total_population >= 55709)%>%
select(year, destination_country, total_population, age)%>%
arrange(desc(total_population))
by_child#check how many adults
by_adult <- by_age%>%
filter(age %in% c("40-44", "45-49", "50-54", "55-59") & total_population >= 55709)%>%
select(year, destination_country, total_population, age)%>%
arrange(desc(total_population))
by_adult#check how many senior
by_senior <- by_age%>%
filter(age %in% c("60-64", "65-69", "70-74", "75+") & total_population >= 55709)%>%
select(year, destination_country, total_population, age)%>%
arrange(desc(total_population))
by_seniorI used dplyr::leftjoin -> Reduce function along with merge to combine all 4 dataframes into one.
merged_age <- Reduce(function(...) merge(..., all=TRUE, sort = FALSE), list(by_child, by_youth, by_adult, by_senior))
merged_ageNow based on the previous analysis in Dataset_1, we will pick only the top countries that have majority of population to get more insights how many migrants for each age category. I will look into the top 6 destinations USA, Turkey, United Arab Emirates, India, Ukraine, and Russian Federation.
First, we need to condense the age groups into categorial factor to be easy to visualize. To do that, we can construct a for-loop to iterate over the merged_age dataframe and change the value into the category we want child, youth, adult, and senior migrants.
for (i in 1:nrow(merged_age)) {
if(merged_age[[i,4]] %in% c("0-4" , "5-9" , "10-14" ,"15-19")) {
merged_age[[i,4]] <- "child"
} else if(merged_age[[i,4]] %in% c("20-24", "25-29" ,"30-34" ,"35-39")) {
merged_age[[i,4]] <- "youth"
} else if(merged_age[[i,4]] %in% c("40-44" ,"45-49" ,"50-54" ,"55-59")) {
merged_age[[i,4]] <- "adult"
} else if(merged_age[[i,4]] %in% c("60-64", "65-69", "70-74", "75+")) {
merged_age[[i,4]] <- "senior"
}
merged_age
}
merged_age#replace NA <- 0
merged_age[is.na(merged_age)] <- 0
top_migrant_countries <- merged_age %>%
filter(destination_country %in% c("United States of America", "United Arab Emirates", "Ukraine", "Turkey", "Russian Federation", "India") & year %in% c(2010, 2015,2019) & age != 0) %>%
select(destination_country, age, total_population, year)%>%
arrange(desc(total_population))
top_migrant_countriesggplot(top_migrant_countries, aes(x=age, y=total_population, fill = destination_country)) +
geom_bar(stat="identity",position=position_dodge()) +
labs(title="Migrant age-categories during 3 intervals",
subtitle="Top migrats age-categories for the top 6 destination countries") +
theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
facet_wrap(~year) +
theme_light()best_3 <- merged_age%>%
filter(destination_country %in% c("United States of America", "United Arab Emirates", "Turkey") & year %in% c("2019") & age != 0) %>%
select(destination_country, age, total_population, year)%>%
arrange(desc(total_population))
best_3ggplot(best_3, aes(x=age, y=total_population, fill = age)) +
geom_bar(stat="identity",position=position_dodge()) +
labs(title="Migrant age-categories distribution over the countries",
subtitle="Top migrats age-categories for the top 6 destination countries") +
theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
facet_wrap(~ destination_country) +
theme_light()library(readr)
UN_Refugee_df <- read_csv("UN_MigrantStockTotal_2019/Table 6-Table 1.csv",
skip = 15)
UN_Refugee_dfNow we have 232 rows with a 22 columns
#Use select function to fetch only the data that we interested by picking the colnames
estimate_refugee_df <- refugee_df %>%
select(c("X2":"2019"))
estimate_refugee_dfestimate_refugee_rename <- estimate_refugee_df %>%
rename(
destination_country = X2,
)
estimate_refugee_renamenum_est_refugee <- estimate_refugee_rename
num_est_refugee$`1990` <- parse_number(num_est_refugee$`1990`)
num_est_refugee$`1995` <- parse_number(num_est_refugee$`1995`)
num_est_refugee$`2000` <- parse_number(num_est_refugee$`2000`)
num_est_refugee$`2005` <- parse_number(num_est_refugee$`2005`)
num_est_refugee$`2010` <- parse_number(num_est_refugee$`2010`)
num_est_refugee$`2015` <- parse_number(num_est_refugee$`2015`)
num_est_refugee$`2019` <- parse_number(num_est_refugee$`2019`)
num_est_refugeeby_refugee_pop <- num_est_refugee%>%
gather(year, total_estimates, c(2:8)) %>%
arrange(desc(total_estimates))
by_refugee_popPlease be sure that you set working directory in Rstudio to the current working directory.
## [1] TRUE
refugeeEstimates <- read.csv(file = "refugeeEstimates.csv", header = TRUE, sep = ",")
refugeeEstimates